Explainable Boosting Machine (EBM) Model
This notebook shows an example of how to use the APIs with the Explainable Boosting Machine (EBM) model — the `ExplainableBoostingRegressor` from the `interpret` package — and how to obtain a factual explanation of the model in the form of feature importances.
[1]:
# Load and join raw data sources and their metadata.
%run Example_InputDataSources.ipynb
[2]:
# Joined DataFrame.
df_all.head()
[2]:
| id_c | education_background_c | professional_experience_c | skills_c | gender_c | agg_perceived_foreign_c | id_j | education_reqs_j | experience_reqs_role_j | experience_reqs_duration_j | skills_j | gender_j | agg_perceived_foreign_j | ranking | shortlisted | score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | [{'institution': 'Complutense University Of Ma... | [{'institution': 'Stylo Milano', 'start_date':... | [Communications, Social Integration, Microsoft... | Man | No | 5 | [Law Bachelor, Degree In Law, Higher Degree In... | [Consultant] | 12 | [Punctuality, Organization, Accounting, Englis... | Man | No | 4 | 1 | 0.0 |
| 1 | 6 | [{'institution': 'Coronel Rosales Agricultural... | [{'institution': 'Securitas Direct', 'start_da... | [Refinancing, Economy, Microsoft Excel, Collec... | Man | No | 3 | [] | [Sales Assistant, Saleswoman, Commercial Advisor] | 12 | [English, Spanish, Communications, Communicati... | Man | No | 8 | 1 | 0.6 |
| 2 | 10 | [{'institution': 'Complutense University Of Ma... | [{'institution': 'Carrefour Express', 'start_d... | [Entrepreneurship, Literacy, Web Design, Adobe... | Woman | No | 5 | [Law Bachelor, Degree In Law, Higher Degree In... | [Consultant] | 12 | [Punctuality, Organization, Accounting, Englis... | Man | No | 4 | 1 | 0.0 |
| 3 | 11 | [{'institution': 'Les Ribera De Los Molinos', ... | [{'institution': 'Decimas Sl', 'start_date': '... | [Consulting, Sap Crm, Collections, Automation,... | Woman | No | 3 | [] | [Sales Assistant, Saleswoman, Commercial Advisor] | 12 | [English, Spanish, Communications, Communicati... | Man | No | 12 | 0 | 0.4 |
| 4 | 15 | [{'institution': 'Escuela Politcnica Superior ... | [{'institution': 'Reintegrate', 'start_date': ... | [Microsoft Word, Biofuels, English, Entreprene... | Man | No | 3 | [] | [Sales Assistant, Saleswoman, Commercial Advisor] | 12 | [English, Spanish, Communications, Communicati... | Man | No | 5 | 1 | 0.7 |
[3]:
# Joined metadata.
md_all
[3]:
{'id_c':
SCHEMA = {'type': 'number'}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'education_background_c':
SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'degree': {'type': 'string'}, 'duration': {'type': 'string'}}}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'professional_experience_c':
SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'role': {'type': 'string'}, 'duration': {'type': 'string'}}}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'skills_c':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'gender_c':
SCHEMA = {'enum': ['Man', 'Woman', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'agg_perceived_foreign_c':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'id_j':
SCHEMA = {'type': 'number'}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'education_reqs_j':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'experience_reqs_role_j':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'experience_reqs_duration_j':
SCHEMA = {'type': 'number'}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'skills_j':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'gender_j':
SCHEMA = {'enum': ['Man', 'Woman', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'agg_perceived_foreign_j':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'score':
SCHEMA = {'type': 'number'}
ATTR_TYPE = numeric
ATTR_USAGE = target
KNOWLEDGE_BASE = None,
'ranking':
SCHEMA = {'type': 'integer'}
ATTR_TYPE = ordinal
ATTR_USAGE = target
KNOWLEDGE_BASE = None,
'shortlisted':
SCHEMA = {'type': 'integer'}
ATTR_TYPE = category
ATTR_USAGE = target
KNOWLEDGE_BASE = None}
[4]:
# Importing libraries to avoid warnings at running time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
[5]:
# Make every scikit-learn transformer return pandas DataFrames instead of
# numpy arrays, so column names are preserved through the pipeline steps.
from sklearn import set_config
set_config(transform_output = "pandas")
[6]:
# Setting category columns in DataFrame based on metadata.
# Cast every column whose metadata marks it as 'category' to the pandas
# 'category' dtype, so downstream encoders and the model treat it correctly.
cat_cols = [col for col, meta in md_all.items() if meta.attr_type == 'category']
df_all[cat_cols] = df_all[cat_cols].astype('category')
# Show the resulting schema: dtypes, non-null counts, memory usage.
df_all.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id_c 1643 non-null int64
1 education_background_c 1643 non-null object
2 professional_experience_c 1643 non-null object
3 skills_c 1643 non-null object
4 gender_c 1643 non-null category
5 agg_perceived_foreign_c 1643 non-null category
6 id_j 1643 non-null int64
7 education_reqs_j 1643 non-null object
8 experience_reqs_role_j 1643 non-null object
9 experience_reqs_duration_j 1643 non-null int64
10 skills_j 1643 non-null object
11 gender_j 1643 non-null category
12 agg_perceived_foreign_j 1643 non-null category
13 ranking 1643 non-null int64
14 shortlisted 1643 non-null category
15 score 1643 non-null float64
dtypes: category(5), float64(1), int64(4), object(6)
memory usage: 149.9+ KB
[7]:
# Define ids, target feature(s), and predictive features.
# Partition the columns into identifiers, targets, and predictive features.
id_cols = ['id_j', 'id_c']
# Candidate target features; the regression example below predicts 'score'.
target_cols = ['score', 'ranking', 'shortlisted']
# Predictive features are everything that is neither an id nor a target.
non_pred_cols = target_cols + id_cols
pred_cols = df_all.columns.difference(non_pred_cols)
[8]:
from findhr.preprocess.example_mappings import RelevantExperienceForRole, ExtractMonthDurationJob, MatchOrdinal, \
    ExtractListOfProperty, MatchFeatureAtLeastInList, MatchFeatureSet, MatchBinary

# Calculated features. Each mapping key is ((input_cols...), (output_cols...))
# and each value is the transformation that derives the outputs from the inputs.
# Suffixes: *_c = candidate attribute, *_j = job attribute.

# Stage 1: extract the candidate's experience entries relevant to the job roles.
maps_derived_1 = {
    (('professional_experience_c', 'experience_reqs_role_j',), ('relevant_exp_role_c',)): RelevantExperienceForRole(),
}
# Stage 2: depends on stage-1 output 'relevant_exp_role_c', hence a separate dict
# (mappings within one DerivedColumn step cannot consume each other's outputs).
maps_derived_2 = {
    (('relevant_exp_role_c',), ('role_duration_months_c',)): ExtractMonthDurationJob(duration_key='duration_months'),
    (('education_background_c',), ('degree_list_c',)): ExtractListOfProperty(property_key='degree')
}
# Fitness features about the matching between candidate's features and job's requirements.
maps_matching = {
    (('experience_reqs_duration_j', 'role_duration_months_c'), ('fitness_experience',)): MatchOrdinal(),
    (('education_reqs_j', 'education_background_c'), ('fitness_education',)): MatchFeatureAtLeastInList(),
    (('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
    (('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
    (('agg_perceived_foreign_j', 'agg_perceived_foreign_c'), ('fitness_foreign',)): MatchBinary()
}
# Helper variable listing the fitness feature names, reused by the preprocessor below.
list_cols_fitness = ['fitness_experience', 'fitness_education', 'fitness_skills', 'fitness_gender', 'fitness_foreign']
maps_matching
[8]:
{(('experience_reqs_duration_j', 'role_duration_months_c'),
('fitness_experience',)): MatchOrdinal(),
(('education_reqs_j', 'education_background_c'),
('fitness_education',)): MatchFeatureAtLeastInList(),
(('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
(('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
(('agg_perceived_foreign_j', 'agg_perceived_foreign_c'),
('fitness_foreign',)): MatchBinary()}
[9]:
# Scikit-learn transformation for numeric and categorical features
# Scikit-learn transformations for numeric and categorical features.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_features = list_cols_fitness
categorical_features = ['gender_c', 'agg_perceived_foreign_c']

# Impute and scale numeric features.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # Not needed for the used dataset, kept for generality.
        ("scaler", StandardScaler()),  # Not strictly needed for the EBM model; kept for the sake of generality.
    ]
)

# Impute and encode categorical features.
categorical_transformer = Pipeline(
    steps=[
        # Not needed for the used dataset, again for the sake of generality.
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # sparse_output=False is required for compatibility with
        # set_config(transform_output="pandas"): pandas output cannot
        # represent scipy sparse matrices and would raise at transform time.
        ("encoder", OneHotEncoder(sparse_output=False)),  # Convert to one-hot encoding.
    ]
)

# Combine the two branches. Only the numeric branch is active in this example;
# uncomment the "cat" line to also one-hot encode the categorical features.
column_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        # ("cat", categorical_transformer, categorical_features)
    ],
)
[10]:
from findhr.preprocess.mapping import AttachMetadata, DetachMetadata, DerivedColumn
# The pipeline is composed of two phases:
# 1. Preprocessing with metadata (using findhr package):
#    attach metadata -> derive features in dependency order -> detach metadata.
pipeline_derived = Pipeline(steps=[
    ("init", AttachMetadata(md_all)),              # attach md_all metadata to the DataFrame
    ('mapping_1', DerivedColumn(maps_derived_1)),  # relevant experience per job role
    ('mapping_2', DerivedColumn(maps_derived_2)),  # duration in months + degree list (uses stage-1 output)
    ("matching", DerivedColumn(maps_matching)),    # fitness_* candidate/job matching features
    # ("fitness", GroundTruthLinearWeightedScorer(gt_weights_fair)),
    ("end", DetachMetadata())                      # return a plain DataFrame (metadata stripped)
])
# 2. Standard scikit-learn preprocessing to prepare the data for the model covered by column preprocessor.
[11]:
## Pipeline Including ExplainableBoostingRegressor
[12]:
# Pipeline definition for regression model on the target feature "score".
from findhr.preprocess.mapping import AttachMetadata, DerivedColumn, DetachMetadata
from interpret.glassbox import ExplainableBoostingRegressor
pipeline_regr = Pipeline(
steps=[
# first phase: preprocessing with metadata
('fitness_value', pipeline_derived
),
# second phase: preprocessing without metadata (standard scikit-learn)
("column_preprocessor", column_preprocessor),
# model inference
("regressor", ExplainableBoostingRegressor())
]
)
[13]:
# Model fit.
# Model fit: run the full pipeline (feature derivation -> preprocessing -> EBM)
# on the predictive features, with 'score' as the regression target.
pipeline_regr.fit(df_all.loc[:, pred_cols], df_all.loc[:, 'score'])
[13]:
Pipeline(steps=[('fitness_value',
Pipeline(steps=[('init',
AttachMetadata(metadata_dict={'agg_perceived_foreign_c':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'agg_perceived_foreign_j':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'education_background_c':
SC...
output_cols=('fitness_skills',))})),
('end', DetachMetadata())])),
('column_preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['fitness_experience',
'fitness_education',
'fitness_skills',
'fitness_gender',
'fitness_foreign'])])),
('regressor', ExplainableBoostingRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('fitness_value',
Pipeline(steps=[('init',
AttachMetadata(metadata_dict={'agg_perceived_foreign_c':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'agg_perceived_foreign_j':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'education_background_c':
SC...
output_cols=('fitness_skills',))})),
('end', DetachMetadata())])),
('column_preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
['fitness_experience',
'fitness_education',
'fitness_skills',
'fitness_gender',
'fitness_foreign'])])),
('regressor', ExplainableBoostingRegressor())])Pipeline(steps=[('init',
AttachMetadata(metadata_dict={'agg_perceived_foreign_c':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'agg_perceived_foreign_j':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'education_background_c':
SCHEMA = {'type': 'array', 'items': {'ty...
ATTR_TYPE = numeric
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('fitness_gender',)),
(('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(input_cols=('skills_j',
'skills_c'),
metadata_dict={'fitness_skills':
SCHEMA = {'type': 'number'}
ATTR_TYPE = numeric
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('fitness_skills',))})),
('end', DetachMetadata())])AttachMetadata(metadata_dict={'agg_perceived_foreign_c':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'agg_perceived_foreign_j':
SCHEMA = {'enum': ['No', 'Yes', 'Any']}
ATTR_TYPE = category
ATTR_USAGE = sensitive
KNOWLEDGE_BASE = None,
'education_background_c':
SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties...
ATTR_USAGE = target
KNOWLEDGE_BASE = None,
'shortlisted':
SCHEMA = {'type': 'integer'}
ATTR_TYPE = category
ATTR_USAGE = target
KNOWLEDGE_BASE = None,
'skills_c':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'skills_j':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None})DerivedColumn(mappings={(('professional_experience_c', 'experience_reqs_role_j'), ('relevant_exp_role_c',)): RelevantExperienceForRole(input_cols=('professional_experience_c',
'experience_reqs_role_j'),
metadata_dict={'relevant_exp_role_c': {'experience_reqs_role_j':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None,
'professional_experience_c':
SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'role': {'type': 'string'}, 'duration': {'type': 'string'}}}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None}},
output_cols=('relevant_exp_role_c',))})DerivedColumn(mappings={(('education_background_c',), ('degree_list_c',)): ExtractListOfProperty(input_cols=('education_background_c',),
metadata_dict={'degree_list_c':
SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
ATTR_TYPE = object
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('degree_list_c',)),
(('relevant_exp_role_c',), ('role_duration_months_c',)): ExtractMonthDurationJob(input_cols=('relevant_exp_role_c',),
metadata_dict={'role_duration_months_c':
SCHEMA = {'type': 'number', 'minimum': 0}
ATTR_TYPE = numeric
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('role_duration_months_c',))})DerivedColumn(mappings={(('agg_perceived_foreign_j', 'agg_perceived_foreign_c'), ('fitness_foreign',)): MatchBinary(input_cols=('agg_perceived_foreign_j',
'agg_perceived_foreign_c'),
metadata_dict={'fitness_foreign':
SCHEMA = {'type': 'number'}
ATTR_TYPE = numeric
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('fitness_foreign',)),
(('education_reqs_j', 'edu...
SCHEMA = {'type': 'number'}
ATTR_TYPE = numeric
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('fitness_gender',)),
(('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(input_cols=('skills_j',
'skills_c'),
metadata_dict={'fitness_skills':
SCHEMA = {'type': 'number'}
ATTR_TYPE = numeric
ATTR_USAGE = default
KNOWLEDGE_BASE = None},
output_cols=('fitness_skills',))})DetachMetadata()
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler', StandardScaler())]),
['fitness_experience', 'fitness_education',
'fitness_skills', 'fitness_gender',
'fitness_foreign'])])['fitness_experience', 'fitness_education', 'fitness_skills', 'fitness_gender', 'fitness_foreign']
SimpleImputer(strategy='median')
StandardScaler()
ExplainableBoostingRegressor()
[14]:
# Example model prediction.
# NOTE: .loc label-slicing is end-inclusive, so .loc[:10] selects 11 rows
# (index labels 0 through 10) — hence 11 values in the output below.
pipeline_regr.predict(df_all.loc[:10, pred_cols])
[14]:
array([-1.26254049e-17, 6.00000000e-01, -1.23256533e-17, 4.00000000e-01,
7.00000000e-01, -1.23256533e-17, 3.50000000e-01, -1.26254049e-17,
-1.23256533e-17, 7.50000000e-01, -1.23256533e-17])
Example Model Explanation
[15]:
# Get a global explanation (overall feature importance) from the fitted
# ExplainableBoostingRegressor step of the pipeline.
# (Removed leftover commented-out argument fragment "#name=list_cols_fitness)".)
explanation_global = pipeline_regr.named_steps['regressor'].explain_global()
[16]:
# Visualize the global explanation through plotting the feature importance.
# Renders inline in the notebook when the cell output is trusted.
explanation_global.visualize()
[17]:
# Get the transformed data as it is just before the model prediction step.
idx_explicand_sample = 0
# iloc[i:i+1] keeps a one-row DataFrame (rather than a Series), so the
# pipeline's transformers receive the 2-D input they expect.
explicand_sample = df_all.loc[:, pred_cols].iloc[idx_explicand_sample:idx_explicand_sample+1]
# pipeline_regr[:-1] is the pipeline with the final 'regressor' step removed.
transformed_data = pipeline_regr[:-1].transform(explicand_sample)
# Local (per-instance) explanation for the transformed sample.
explanation_local = pipeline_regr.named_steps['regressor'].explain_local(transformed_data)
[18]:
# Visualize the local explanation for the first sample explained
# (the index 0 refers to the first instance passed to explain_local above).
# See documentation at https://interpret.ml/docs/ebm.html for further details.
explanation_local.visualize(0)
[ ]: